# Project 10 - BusinessAnalytics - DSA (part of Formação cientista de dados)
# Forecasting products that users will buy on their next order

# In this project, we will analyse datasets provided by Instacart in their Kaggle competition.
# The idea is to understand users' behavior, how products are related, and then forecast which
# products will be in each user's next order. For that, we have a dataset with users' prior
# orders, a dataset with each user's last order, and information about products. Price and
# quantity are not available.

# Define working directory
# NOTE(review): hard-coded absolute path only works on the author's machine;
# consider relative paths or here::here() for portability.
setwd("C:/Cursos/FCD/05-BusinessAnalytics/Cap10-ProjetosFeedback/Projeto10")
getwd()
## [1] "C:/Cursos/FCD/05-BusinessAnalytics/Cap10-ProjetosFeedback/Projeto10"
library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(scales)
library(knitr)
library(arules)
## Warning: package 'arules' was built under R version 3.6.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 3.6.3
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.6.3
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(stringr)

# Datasets ---------------------------------------------------------------------------
# fread() (data.table) gives fast CSV loading of the Instacart competition files.
dfAisles <- fread("aisles.csv")
dfDepartments <- fread("departments.csv")
dfOrderProductsPrior <- fread("order_products__prior.csv")
dfOrderProductsTrain <- fread("order_products__train.csv")
dfOrders <- fread("orders.csv")
dfProducts <- fread("products.csv")

# Joining the datasets: enrich prior line items with product, aisle,
# department and order attributes.
df <- left_join(dfOrderProductsPrior,dfProducts,by="product_id")
df <- left_join(df,dfAisles,by="aisle_id")
df <- left_join(df,dfDepartments,by="department_id")
df <- left_join(df,dfOrders,by="order_id")
# NOTE(review): positional column selection is fragile — it silently breaks if
# any input file gains or loses a column. Prefer select() with column names.
df <- df[, c(1,2,10,5,8,9,3,4,12,13,14,15)]

# Attach order metadata to the train line items as well.
dfOrderProductsTrain <- left_join(dfOrderProductsTrain,dfOrders, by='order_id')

# Replace aisle/department ids on the product table with their names.
dfProducts <- dfProducts %>% left_join(dfAisles) %>% left_join(dfDepartments) %>% 
  select(-aisle_id, -department_id)
## Joining, by = "aisle_id"
## Joining, by = "department_id"
rm(dfAisles, dfDepartments, dfOrderProductsPrior, dfOrders)

# Exploratory analisys --------------------------------------------------------------

# All Products and departments
# Order products by department/aisle so the aisle factor levels follow the
# department blocks in the legend.
dfProducts <- dfProducts %>% arrange(department, aisle)
dfProducts$aisle <- factor(dfProducts$aisle, levels=unique(dfProducts$aisle))

print(paste('Number of departments:', length(unique(dfProducts$department))))
## [1] "Number of departments: 21"
# Share of the product catalogue held by each department, stacked by aisle.
dfProducts %>%
  group_by(department,aisle) %>%
  summarize(contar = n()) %>%
  mutate(perc = contar / nrow(dfProducts)) %>%
  select(-contar) %>%
  ggplot() + geom_bar(aes(department, perc, fill=aisle), stat="identity", colour="black") + 
  labs(title="Number of products vs department", x="Department", y="% of products") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'bottom') + 
  scale_y_continuous(labels = percent) + guides(fill=guide_legend(ncol=10)) + 
  # decimal.mark set explicitly so it differs from big.mark: keeps the '.'
  # thousands separator and silences the
  # "'big.mark' and 'decimal.mark' are both '.'" warning.
  annotate("text", x = 3, y = 0.11, label = paste(format(length(unique(dfProducts$product_id)), big.mark='.', decimal.mark=',')," products"), fontface = 2, size = 12)

# Department orders
# Share of line items per department; the three biggest departments are
# highlighted (FALSE -> first fill colour).
df %>%
  group_by(department) %>%
  summarize(contar = n()) %>%
  mutate(perc = contar / sum(contar)) %>%
  # Logical vector directly (TRUE/FALSE, never T/F): FALSE marks the three
  # highlighted departments, replacing the ifelse() with an '|' chain.
  mutate(highlight = !(department %in% c('produce', 'dairy eggs', 'snacks'))) %>%
  ggplot + geom_bar(aes(x=reorder(department,-perc), y=perc, fill=highlight), stat="identity") + 
  labs(title="Department orders", x="Department", y="% of orders") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") + 
  scale_y_continuous(labels = percent) + 
  annotate("text", x = 6.5, y = 0.2, label = "3 departments represents more than 50% of all orders", fontface = 2, size = 8, color="#0f4c81") + 
  scale_fill_manual(values=c("#0f4c81", "#999999"))

# TOP 20 aisle
# Share of line items per aisle; keep the 20 most frequent aisles.
df %>%
  group_by(aisle) %>%
  summarize(contar = n()) %>%
  mutate(perc = contar / sum(contar)) %>%
  top_n(20, perc) %>%  # explicit wt silences the "Selecting by perc" message
  ggplot + geom_bar(aes(x=reorder(aisle,-perc), y=perc), stat="identity") + 
    labs(title="TOP 20 - Aisle", x="Aisle", y="% of orders") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
    scale_y_continuous(labels = percent) + 
    # trailing comma removed from the original annotate() call — it passed an
    # empty (missing) argument into ggplot2
    annotate("text", x = 10, y = 0.07, label = "Top 20 aisle represents 60% of all orders", fontface = 2, size = 8)

# Top 20 products
# Share of line items per product; keep the 20 largest shares.
df %>%
  count(product_name, name = "contar") %>%
  mutate(perc = contar / sum(contar)) %>%
  top_n(20) %>%
  ggplot() +
  geom_col(aes(reorder(product_name, -perc), perc)) +
  labs(title="TOP 20 - Products", x="Product", y="% of orders") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = percent)
## Selecting by perc

# Clients' number of orders
# Count distinct orders per user, then the distribution of that count.
df %>%
  distinct(user_id, order_id) %>%
  count(user_id, name = "numOrdens") %>%
  count(numOrdens, name = "contar") %>%
  mutate(perc = contar / sum(contar)) %>%
  ggplot() +
  geom_col(aes(numOrdens, perc)) +
  labs(title="Clients' number of orders", x="# of orders", y="% of clients") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(breaks = seq(0, 100, by = 2))

# The minimum number of orders a client placed in the dataset was 3; a few clients ordered 100 times.

# Day of week vs hour orders
# Heatmap of order counts by (day of week, hour of day), one row per order.
df %>%
  distinct(order_id, order_dow, order_hour_of_day) %>%
  count(order_dow, order_hour_of_day, name = "contar") %>%
  ggplot() +
  geom_tile(aes(order_dow, order_hour_of_day, fill = contar)) +
  scale_x_continuous(breaks = 0:6) +
  scale_y_continuous(breaks = 0:23) +
  labs(title="Day of week vs hour orders", x="Day of week", y="Hour")

# Most clients order between 13h and 15h on Saturday and between 9h and 10h
# on Sunday.

# Products by add order
# For each add-to-cart position (1..5), the 5 products most often placed at
# that position. After summarize() the data is still grouped by
# add_to_cart_order, so top_n(5) picks the top 5 WITHIN each position
# (and perc is the share within the position, not overall).
df %>%
  group_by(add_to_cart_order, product_name) %>%
  summarize(contar = n()) %>%
  mutate(perc = contar / sum(contar)) %>%
  arrange(add_to_cart_order,desc(contar)) %>%
  filter(add_to_cart_order <= 5) %>%
  top_n(5) %>%
  ggplot + geom_bar(aes(x=reorder(product_name,-perc), y=perc, fill=product_name), stat="identity") + 
    facet_grid(rows=vars(add_to_cart_order)) + 
    labs(title="TOP 5 - Products by add order", x="Products", y="% of orders") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") + 
    scale_y_continuous(labels = percent) + 
    scale_fill_manual(values=c("#ffef99", "#ffeb7f", "#8dc63f", "#065535", "#ff5f5f", "white"))
## Selecting by perc

# Order distribution by number of products
# Distribution of basket sizes (truncated at 50 products). The blue overlay
# is PorcReorderedFull * perc, i.e. the reordered fraction scaled to the grey
# bar's height, so blue reads as "reordered share of this bar".
df %>%
  group_by(order_id) %>%
  summarize(PorcReordered = sum(reordered) / n(), numProdutosNaOrdem = n()) %>%
  group_by(numProdutosNaOrdem) %>%
  summarise(qtOrdens = n(), PorcReorderedFull = mean(PorcReordered)) %>%
  mutate(perc = qtOrdens / sum(qtOrdens)) %>%
  mutate(PorcReordered = PorcReorderedFull * perc) %>%
  filter(numProdutosNaOrdem <= 50) %>%
  ggplot + geom_bar(aes(x=numProdutosNaOrdem, y=perc), stat="identity") + 
  geom_bar(aes(x=numProdutosNaOrdem, y=PorcReordered), stat="identity", fill = "#0f4c81") +
  labs(title="Order distribution by number of products (% reordered in blue)", x="# products", y="% of orders") + 
  scale_y_continuous(labels = percent) + scale_x_continuous(breaks = seq(1, 50, by = 1)) + 
  annotate("rect", xmin = 0.5, xmax = 8.5, ymin = -0.005, ymax = .075, alpha= .2) + 
  annotate("text", x = 13, y = 0.07, label = "50% of orders", fontface = 2, size = 8, color="#696969") +
  annotate("text", x = 24, y = 0.02, label = "Mean: 60% reordered", fontface = 2, size = 7, color="#0f4c81") +
  geom_text(aes(label = paste(format(PorcReorderedFull*100, digits=0),"%"), x=numProdutosNaOrdem, y=0), vjust=-0.8, color="white", size=3)

# Orders vs prior order in days
# Share of orders by days since the previous order, reordered share in blue.
# days_since_prior_order is NA for each user's first order — that NA group is
# what triggers the "Removed 1 rows" warnings below.
df %>%
  group_by(order_id, days_since_prior_order) %>%
  summarise(PorcReordered = sum(reordered) / n()) %>%
  group_by(days_since_prior_order) %>%
  summarise(qtOrdens = n(), PorcReorderedFull = mean(PorcReordered)) %>%
  mutate(perc = qtOrdens / sum(qtOrdens)) %>%
  mutate(PorcReordered = PorcReorderedFull * perc) %>%
  ggplot + geom_bar(aes(x=days_since_prior_order, y=perc), stat="identity") + 
  geom_bar(aes(x=days_since_prior_order, y=PorcReordered), stat="identity", fill = "#0f4c81") +
  geom_text(aes(label = paste(format(PorcReorderedFull*100, digits=0),"%"), x=days_since_prior_order, y=0), vjust=-0.8, color="white") +
  labs(title="% of orders vs days from prior order", x="Days from prior order", y="% of orders") + 
  scale_y_continuous(labels = percent) + scale_x_continuous(breaks = seq(0, 30, by = 1))
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

# We have two peaks: the first weekly (7 days) and the other monthly (30 days).

# Products bought together with many others
# Average basket size of the orders each product appears in; largest 10.
# (Self-join attaches the basket size to every line item.)
df %>%
  count(order_id, name = "numProdutosNaOrdem") %>%
  left_join(df, by = 'order_id') %>%
  group_by(product_name) %>%
  summarise(numProdutosNaOrdemMedio = mean(numProdutosNaOrdem),
            numProductsSold = n()) %>%
  filter(numProductsSold > 2) %>%
  top_n(10, numProdutosNaOrdemMedio) %>%
  ggplot() +
  geom_col(aes(reorder(product_name, -numProdutosNaOrdemMedio),
               numProdutosNaOrdemMedio)) +
  labs(title="Products bougth with many other", x="Products", y="# products in order") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = comma)

# Products bought almost exclusively
# Same self-join as above, but keeping the 10 SMALLEST average basket sizes
# (top_n with negative n).
df %>%
  count(order_id, name = "numProdutosNaOrdem") %>%
  left_join(df, by = 'order_id') %>%
  group_by(product_name) %>%
  summarise(numProdutosNaOrdemMedio = mean(numProdutosNaOrdem),
            numProductsSold = n()) %>%
  filter(numProductsSold > 2) %>%
  top_n(-10, numProdutosNaOrdemMedio) %>%
  ggplot() +
  geom_col(aes(reorder(product_name, numProdutosNaOrdemMedio),
               numProdutosNaOrdemMedio)) +
  labs(title="Products bougth almost exclusive", x="Products", y="# products in order") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = comma)

# Products most reordered
# Reordered share per product (products sold at least 3 times); top 10 shares.
df %>%
  group_by(product_name) %>%
  summarise(numProductsSold = n(),
            PorcReordered = sum(reordered) / n()) %>%
  filter(numProductsSold > 2) %>%
  top_n(10, PorcReordered) %>%
  ggplot() +
  geom_col(aes(reorder(product_name, PorcReordered), PorcReordered)) +
  labs(title="Products most reordered", x="Products", y="% Reordered") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = percent)

# Reordered - quantile and mean
# Per-product reordered share (products sold at least 3 times), summarised
# by quartiles and the overall mean.
dfReorderShare <- df %>%
  group_by(product_name) %>%
  summarise(numReordered = sum(reordered),
            numProductsSold = n()) %>%
  filter(numProductsSold > 2) %>%
  mutate(PorcReordered = numReordered / numProductsSold)

quantile(dfReorderShare$PorcReordered)
##        0%       25%       50%       75%      100% 
## 0.0000000 0.2142857 0.3808166 0.5303560 0.9411765
print(paste("Mean: ", mean(dfReorderShare$PorcReordered)))
## [1] "Mean:  0.369625043133485"
rm(dfReorderShare)

# Market Basket Analysis

# Generate a csv dataset: one line per order with its products
# comma-separated (basket format). Run once; kept commented out because the
# file already exists on disk.
#dfSparse <- df %>%
#  group_by(order_id) %>%
#  select(order_id, product_name) %>%
#  mutate(product_name2 = paste0(product_name, collapse = ",")) %>%
#  select(order_id,product_name2) %>%
#  distinct()
#dfSparse$order_id <- NULL
#write.csv(dfSparse,"market_basket_transactions.csv", quote = FALSE, row.names = FALSE)
#rm(dfSparse)

# Import as transactions for arules. Duplicated items within a basket are
# dropped (warning below).
tr <- read.transactions('market_basket_transactions.csv', format = 'basket', sep=',')
## Warning in asMethod(object): removing duplicated items in transactions
# Inspect the transaction matrix: ~3.2M baskets, ~282k distinct items.
summary(tr)
## transactions as itemMatrix in sparse format with
##  3214875 rows (elements/itemsets/transactions) and
##  281851 columns (items) and a density of 3.520976e-05 
## 
## most frequent items:
##                 Banana Bag of Organic Bananas   Organic Strawberries   Organic Baby Spinach   Organic Hass Avocado                (Other) 
##                 460485                 367637                 252559                 232235                 205061               30386140 
## 
## element (itemset/transaction) length distribution:
## sizes
##      1      2      3      4      5      6      7      8      9     10     11     12     13     14     15     16     17     18     19     20     21     22     23     24     25     26     27     28     29     30     31     32     33     34     35     36     37     38     39     40     41     42     43     44     45     46     47     48     49     50     51     52     53     54     55     56     57     58     59     60     61     62     63     64     65     66     67     68     69     70     71     72     73     74     75     76     77     78     79     80     81     82     83     84     85     86     87     88     89     90     91     92     93     94     95     96     97     98     99    100    101    102    105    110    111    112    113    114    119    139    150 
## 173997 198879 213972 224853 227409 225230 216030 200441 182058 163733 145878 129947 115507 101932  89900  79491  69553  61292  53686  46683  40789  35385  30654  26430  22919  19838  17001  14613  12638  10863   9192   7861   6780   5783   5026   4267   3586   3090   2604   2184   1828   1556   1371   1141   1014    898    729    645    513    459    376    336    242    226    205    176    166    116    109     84     83     78     69     54     51     44     37     30     33     27     14     13     26     16     17     11     11     11      8      6      9      2      3      4      5      2      2      4      5      1      2      1      1      2      3      1      1      4      2      1      1      4      2      1      2      1      1      1      2      1      1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   4.000   8.000   9.924  13.000 150.000 
## 
## includes extended item information - examples:
##                         labels
## 1                           #2
## 2            #2 Coffee Filters
## 3 #2 Cone White Coffee Filters
# Calculate association rules: min support 0.1% (~3214 baskets), min
# confidence 80%, at most 10 items per rule.
regrasAssociacao <- apriori(tr, parameter = list(supp=0.001, conf=0.8,maxlen=10))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target   ext
##         0.8    0.1    1 none FALSE            TRUE       5   0.001      1     10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 3214 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[281851 item(s), 3214875 transaction(s)] done [20.45s].
## sorting and recoding items ... [1750 item(s)] done [0.45s].
## creating transaction tree ... done [4.07s].
## checking subsets of size 1 2 3 4 done [1.23s].
## writing ... [231 rule(s)] done [0.00s].
## creating S4 object  ... done [0.69s].
# 231 rules created. Let's take a look at the top 10.
inspect(regrasAssociacao[1:10])
##      lhs                                                 rhs                           support     confidence lift     count
## [1]  {Medium Pulp}                                    => {Country Stand Juice}         0.001146856 1.0000000  871.9487 3687 
## [2]  {Country Stand Juice}                            => {Medium Pulp}                 0.001146856 1.0000000  871.9487 3687 
## [3]  {One-Ply}                                        => {Paper Towels Choose-A-Sheet} 0.001074692 1.0000000  930.4993 3455 
## [4]  {Paper Towels Choose-A-Sheet}                    => {One-Ply}                     0.001074692 1.0000000  930.4993 3455 
## [5]  {One-Ply}                                        => {Mega Rolls}                  0.001074692 1.0000000  912.7981 3455 
## [6]  {Mega Rolls}                                     => {One-Ply}                     0.001074692 0.9809767  912.7981 3455 
## [7]  {Paper Towels Choose-A-Sheet}                    => {Mega Rolls}                  0.001074692 1.0000000  912.7981 3455 
## [8]  {Mega Rolls}                                     => {Paper Towels Choose-A-Sheet} 0.001074692 0.9809767  912.7981 3455 
## [9]  {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {Select-A-Size Paper Towels}  0.001219954 1.0000000  605.4379 3922 
## [10] {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {White}                       0.001219954 1.0000000  474.2403 3922
# Remove redundant rules
# is.subset(r, r)[i, j] is TRUE when rule i's itemset is contained in rule
# j's; a column sum > 1 therefore flags rules that contain some other rule.
# NOTE(review): this keeps only the most specific rules — confirm against the
# arules is.subset() docs if the rule set is regenerated.
regrasSubset <- which(colSums(is.subset(regrasAssociacao, regrasAssociacao)) > 1) # get subset rules in vector
length(regrasSubset)
## [1] 178
regrasAssociacaoSubSet <- regrasAssociacao[-regrasSubset]

# We removed 178 rules, 53 remaining. Average support of 0.0011 and confidence from 0.8 to 1
inspect(regrasAssociacaoSubSet)
##      lhs                                                 rhs                                    support     confidence lift      count
## [1]  {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {Select-A-Size Paper Towels}           0.001219954 1.0000000  605.43785  3922
## [2]  {2 Huge Rolls = 5 Regular Rolls  Towels/Napkins} => {White}                                0.001219954 1.0000000  474.24030  3922
## [3]  {98% Fat Free}                                   => {Gluten Free}                          0.001034566 0.9904705  248.16763  3326
## [4]  {Chocolate Chip Walnut}                          => {Cookies}                              0.001086512 1.0000000  679.82132  3493
## [5]  {Organic Snack Mix Bunnies Snack Mix}            => {Organic}                              0.001049185 1.0000000   46.87773  3373
## [6]  {Party Size}                                     => {Simply Naked}                         0.001107975 0.9024576  537.47467  3562
## [7]  {Party Size}                                     => {Pita Chips}                           0.001107975 0.9024576  520.50381  3562
## [8]  {Ginger Root Beer}                               => {Naturally Flavored Zero Calorie Soda} 0.001273144 1.0000000  596.45176  4093
## [9]  {Ginger Root Beer}                               => {Caffeine Free}                        0.001273144 1.0000000  496.65920  4093
## [10] {Roja}                                           => {Hot}                                  0.001120728 1.0000000  674.11931  3603
## [11] {Roja}                                           => {Salsa}                                0.001120728 1.0000000  577.59163  3603
## [12] {Americano}                                      => {Prosciutto}                           0.001294918 1.0000000  356.57442  4163
## [13] {Fat Free}                                       => {Milk}                                 0.001211867 0.8171141   63.41540  3896
## [14] {Select-A-Size Paper Towels}                     => {White}                                0.001581088 0.9572505  453.96675  5083
## [15] {No Salt Added}                                  => {Organic Tomato Paste}                 0.001194448 0.9149392  141.90541  3840
## [16] {Kalamata}                                       => {Olives}                               0.001168319 1.0000000  652.63398  3756
## [17] {Kalamata}                                       => {Organic}                              0.001164586 0.9968051   46.72796  3744
## [18] {97% Fat Free}                                   => {Gluten Free}                          0.001212489 0.9723123  243.61800  3898
## [19] {Citrus}                                         => {Organic Raw}                          0.001331311 0.9891380  577.85843  4280
## [20] {Citrus}                                         => {Kombucha}                             0.001331311 0.9891380  546.29015  4280
## [21] {Jalapeno Lime}                                  => {Tortilla Chips}                       0.001307982 1.0000000  275.29329  4205
## [22] {Almondmilk Creamer}                             => {Vanilla}                              0.001609705 1.0000000  354.52966  5175
## [23] {Pitted}                                         => {Olives}                               0.001164898 0.9067797  591.79522  3745
## [24] {Pitted}                                         => {Organic}                              0.001251371 0.9740920   45.66323  4023
## [25] {Brown Rice}                                     => {Tortillas}                            0.001299273 0.8217588  200.32240  4177
## [26] {and Pear Baby Food}                             => {Mango}                                0.001149034 1.0000000  524.36389  3694
## [27] {Happy Baby Spinach}                             => {Mango}                                0.001149034 1.0000000  524.36389  3694
## [28] {Clasico}                                        => {Tortilla Chips}                       0.001531319 1.0000000  275.29329  4923
## [29] {Country Buttermilk}                             => {Bread}                                0.001862281 1.0000000  173.77703  5987
## [30] {Fruit Spread}                                   => {Strawberry}                           0.001347175 0.8537355   78.05292  4331
## [31] {Whole Peeled}                                   => {Tomatoes}                             0.001851394 1.0000000  418.93081  5952
## [32] {Apricot & Banana Stage 2 Baby Food}             => {Peach}                                0.001478440 1.0000000  263.06153  4753
## [33] {Hope}                                           => {Hummus}                               0.001728528 1.0000000  405.20229  5557
## [34] {Original Recipe}                                => {Hummus}                               0.001728528 0.9940966  402.81022  5557
## [35] {Reduced Fat}                                    => {2% Milkfat}                           0.002277227 0.9824208  156.38543  7321
## [36] {Reduced Fat}                                    => {Milk}                                 0.002277849 0.9826892   76.26552  7323
## [37] {Crispy Wheat}                                   => {Crackers}                             0.002190754 1.0000000  190.54499  7043
## [38] {Italian (Flat)}                                 => {Parsley}                              0.002594502 1.0000000  223.03837  8341
## [39] {New England Grown}                              => {Parsley}                              0.002595746 0.9449666  210.76380  8345
## [40] {Organic Milk Reduced Fat}                       => {2% Milkfat}                           0.003844940 1.0000000  159.18375 12361
## [41] {Lowfat}                                         => {Yogurt}                               0.003067615 0.9642159  128.48518  9862
## [42] {Lowfat}                                         => {Strawberry}                           0.002608500 0.8199061   74.96007  8386
## [43] {Organic Butterhead (Boston}                     => {Butter}                               0.003595785 1.0000000  165.11093 11560
## [44] {Strained Low-Fat}                               => {Yogurt}                               0.003604495 1.0000000  133.25354 11588
## [45] {Bibb) Lettuce}                                  => {Butter}                               0.003736382 1.0000000  165.11093 12012
## [46] {Flat Parsley}                                   => {Bunch}                                0.004006377 1.0000000   74.04816 12880
## [47] {Coconut}                                        => {Yogurt}                               0.003605117 0.9501558  126.61162 11590
## [48] {Super Spinach! Baby Spinach}                    => {Baby Bok Choy}                        0.004280104 1.0000000  186.17530 13760
## [49] {Sweet Baby Kale}                                => {Baby Bok Choy}                        0.004280104 1.0000000  186.17530 13760
## [50] {YoKids Squeezers Organic Low-Fat Yogurt}        => {Strawberry}                           0.005730549 1.0000000   91.42518 18423
## [51] {Vitamin D}                                      => {Milk}                                 0.008704226 0.9758334   75.73345 27983
## [52] {Organic Red Radish}                             => {Bunch}                                0.008047902 1.0000000   74.04816 25873
## [53] {Bag}                                            => {Clementines}                          0.011584276 0.8654892   41.38749 37242
# Transform into a data frame for later use
dfAssociationRules = DATAFRAME(regrasAssociacaoSubSet)
# Strip the surrounding {braces} from the LHS/RHS item labels.
dfAssociationRules$LHS <- str_sub(dfAssociationRules$LHS,2,str_length(dfAssociationRules$LHS)-1)
dfAssociationRules$RHS <- str_sub(dfAssociationRules$RHS,2,str_length(dfAssociationRules$RHS)-1)
# Rule1/Rule2 start as the sentinel 0 and later receive RHS item names.
dfAssociationRules <- dfAssociationRules %>% mutate(Rule1=0, Rule2=0)
# n (column 2 after the join) counts how many rules share the same LHS.
dfAssociationRules <- dfAssociationRules %>% group_by(LHS) %>% summarise(n=n()) %>% left_join(dfAssociationRules)
## Joining, by = "LHS"
item = 1
# Walk the rules; when an LHS has exactly two rules, copy both RHS values
# into Rule1/Rule2 on BOTH rows and skip the second row of the pair.
# NOTE(review): relies on (a) the two rows of a pair being adjacent after the
# join, (b) at most 2 rules per LHS, and (c) column 2 being the count n —
# verify all three if the rule set is regenerated. It also mixes types:
# numeric 0 sentinels are overwritten with character RHS values.
while (item <= length(dfAssociationRules$LHS)){
  dfAssociationRules[item,'Rule1']=dfAssociationRules[item,'RHS']
  if (dfAssociationRules[item,2] == 2){
    dfAssociationRules[item,'Rule2']=dfAssociationRules[item+1,'RHS']
    dfAssociationRules[item + 1,'Rule1']=dfAssociationRules[item,'RHS']
    dfAssociationRules[item + 1,'Rule2']=dfAssociationRules[item+1,'RHS']
    item = item + 1
  }
  item = item + 1
}

# Build an RHS-side lookup (product -> Rule1/Rule2) and stack it with the
# LHS-side lookup, so a product can be matched from either side of a rule.
dfTemp <- dfAssociationRules %>% select(RHS, Rule1, Rule2) %>% distinct()
dfTemp <- dfTemp %>% group_by(RHS) %>% summarise(n=n()) %>% left_join(dfTemp)
## Joining, by = "RHS"
# Drop duplicated RHS rows that carry no second rule (Rule2 still the 0 sentinel).
dfTemp <- dfTemp[!(dfTemp$n==2 & dfTemp$Rule2==0),]
names(dfTemp)[1] <- "product_name"
dfTemp$n <- NULL

dfAssociationRulesFinal <- dfAssociationRules %>% select(LHS, Rule1, Rule2) %>% distinct()
names(dfAssociationRulesFinal)[1] <- "product_name"
dfAssociationRulesFinal <- bind_rows(dfAssociationRulesFinal,dfTemp)

# Data Viz
# Keep rules with confidence > 0.4 for the interactive plots.
subRegras<-regrasAssociacao[quality(regrasAssociacao)$confidence>0.4]
# Plot SubRules (scatter: support x confidence)
# The majority of rules sit up-left; support is smaller than 0.005 bps.
plot(subRegras, engine = "htmlwidget")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# The color represents the number of items in the rule
plot(subRegras,method="two-key plot", engine = "htmlwidget")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
# This graph shows the rules as a chain. It is possible to see the entire rule.
top10subRules <- head(subRegras, n = 10, by = "confidence")
plot(top10subRules, method = "graph",  engine = "htmlwidget")
# Parallel-coordinates view: how buying one product leads to another.
# 2 - The most recent addition
# 1 - The item I had
subRegras2<-head(subRegras, n=20, by="lift")
plot(subRegras2, method="paracoord")

# Some rules such as Huge Rolls -> Paper Towels have great support and lift
plot(top10subRules, method = "grouped")

# Clean up the rule-mining intermediates.
rm(tr, regrasAssociacao, regrasAssociacaoSubSet, subRegras, subRegras2, top10subRules, regrasSubset)

# Model-----------------------------------------------------------------------------------------

# Creating indicators for products (joined onto dfProducts one block at a time)
# % Reordered: share of the product's line items flagged as reordered.
dfTemp <- df %>% group_by(product_id) %>% summarise(numReorderedMeanPrd = sum(reordered)/n())
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_id"
# Mean/std of the basket size of orders the product appears in
# (self-join attaches each order's size to its line items).
dfTemp <- df %>%
  group_by(order_id) %>%
  summarize(numPrdOrdem = n()) %>%
  left_join(df) %>%
  group_by(product_name) %>%
  summarise(numPrdOrderMeanPrd = mean(numPrdOrdem),
            numPrdOrderStdPrd = sd(numPrdOrdem))
## Joining, by = "order_id"
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_name"
# Day of week, hour, add-to-cart position and order number: mean/std per product.
dfTemp <- df %>%
  group_by(product_id) %>%
  summarize(
    dayOfWeekMeanPrd = mean(order_dow),
    dayOfWeekStdPrd = sd(order_dow),
    hourDayMeanPrd = mean(order_hour_of_day),
    hourDayStdPrd = sd(order_hour_of_day),
    addCartMeanPrd = mean(add_to_cart_order),
    addCartStdPrd = sd(add_to_cart_order),
    orderNumberMeanPrd = mean(order_number),
    orderNumberStdPrd = sd(order_number))
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_id"
# Days since prior order: mean/std per product.
# days_since_prior_order is NA for each user's first order — drop those rows
# first so mean/sd are not poisoned by NA. (!is.na(x) replaces the
# non-idiomatic `is.na(x) == FALSE`.)
dfTemp <- df %>% filter(!is.na(days_since_prior_order)) %>%
  group_by(product_id) %>%
  summarize(
    daysPriorOrderMeanPrd = mean(days_since_prior_order),
    daysPriorOrderStdPrd = sd(days_since_prior_order))
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_id"
# How many times the product was sold (line-item count per product)
dfTemp <- df %>% count(product_id, name = "NumSoldPrd")
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_id"
# How many distinct users bought the product
dfTemp <- df %>% distinct(product_id, user_id) %>%
  count(product_id, name = "NumUserBougthPrd")
dfProducts <- left_join(dfProducts, dfTemp)
## Joining, by = "product_id"
# Creating indicators for users
# % Reordered mean/std: per-order reordered share, then mean/sd per user.
dfUsers <- df %>% group_by(user_id, order_id) %>% summarise(n = sum(reordered)/n()) %>%
  group_by(user_id) %>% summarise(numReorderedMeanUser = mean(n), numReorderedMStdUser = sd(n))
# Dow of last order (the train set holds each user's most recent order).
# NOTE(review): names(dfUsers)[2] renames by POSITION — correct only while the
# right_join keeps the joined column second; fragile if columns change.
dfUsers <- dfOrderProductsTrain %>% select(user_id, order_dow) %>% distinct() %>% right_join(dfUsers)
## Joining, by = "user_id"
names(dfUsers)[2] <- "DowLastOrderUser"
# Hour of last order (same positional-rename pattern).
dfUsers <- dfOrderProductsTrain %>% select(user_id, order_hour_of_day) %>% distinct() %>% right_join(dfUsers)
## Joining, by = "user_id"
names(dfUsers)[2] <- "HourLastOrderUser"
# Days since prior order of the last order (same positional-rename pattern).
dfUsers <- dfOrderProductsTrain %>% select(user_id, days_since_prior_order) %>% distinct() %>% right_join(dfUsers)
## Joining, by = "user_id"
names(dfUsers)[2] <- "DaysSincePriorLastOrderUser"
# Mean/std of the basket sizes of the orders each user places
# (self-join attaches each order's size to its line items).
dfTemp <- df %>%
  group_by(order_id) %>%
  summarize(numPrdOrdem = n()) %>%
  left_join(df) %>%
  group_by(user_id) %>%
  summarise(numPrdMeanUser = mean(numPrdOrdem), numPrdStdUser = sd(numPrdOrdem))
## Joining, by = "order_id"
dfUsers <- left_join(dfUsers, dfTemp)
## Joining, by = "user_id"
# Day of week, hour, add-to-cart position and order number: mean/std per user.
dfTemp <- df %>%
  group_by(user_id) %>%
  summarize(
    dayOfWeekMeanUser = mean(order_dow),
    dayOfWeekStdUser = sd(order_dow),
    hourDayMeanUser = mean(order_hour_of_day),
    hourDayStdUser = sd(order_hour_of_day),
    addCartMeanUser = mean(add_to_cart_order),
    addCartStdUser = sd(add_to_cart_order),
    orderNumberMeanUser = mean(order_number),
    orderNumberStdUser = sd(order_number))
dfUsers <- left_join(dfUsers, dfTemp)
## Joining, by = "user_id"
# Days since prior order: average/std per user.
# NA rows are first orders (no prior-order gap), so they are excluded.
# FIX: use idiomatic `!is.na(x)` instead of `is.na(x) == FALSE`.
dfTemp <- df %>% filter(!is.na(days_since_prior_order)) %>%
  group_by(user_id) %>%
  summarize(
    daysPriorOrderMeanUser = mean(days_since_prior_order),
    daysPriorOrderStdUser = sd(days_since_prior_order))
dfUsers <- left_join(dfUsers, dfTemp)
## Joining, by = "user_id"
# How many orders the client has placed (distinct orders per user)
dfTemp <- df %>% group_by(order_id, user_id) %>% summarise(NumPrd = n()) %>%
  group_by(user_id) %>% summarise(NumOrderedUser = n())
dfUsers <- left_join(dfUsers, dfTemp)
## Joining, by = "user_id"
# How many products the user bought (total item rows across all orders)
dfTemp <- df %>% group_by(user_id) %>% summarise(NumPrdUser = n())
dfUsers <- left_join(dfUsers, dfTemp)
## Joining, by = "user_id"
# Build the final dataset: one row per (user, product) pair,
# enriched with the product- and user-level indicators.
dfFinal <- df %>%
  distinct(user_id, product_id) %>%
  left_join(dfProducts, by = 'product_id') %>%
  left_join(dfUsers, by = 'user_id')

# User-product (pair) indicators
# Number of times the user bought the product
dfTemp <- df %>% group_by(user_id, product_id) %>% summarise(PU_NumUserBoutghPrd = n())
dfFinal <- left_join(dfFinal, dfTemp, by=c('user_id','product_id'))
# Number of orders the user placed after buying the product for the last time
dfTemp <- df %>% group_by(user_id, product_id) %>% summarise(order_number = max(order_number))
dfTemp <- df %>% group_by(user_id) %>% summarise(order_numberMAX = max(order_number)) %>%
  left_join(dfTemp, by='user_id') %>% mutate(PU_OrdersSinceLastBougth = order_numberMAX-order_number)
# Drop the helper columns; keep only the keys and the derived indicator
dfTemp$order_number <- NULL
dfTemp$order_numberMAX <- NULL
dfFinal <- left_join(dfFinal, dfTemp, by=c('user_id','product_id'))
# Number of days passed after buying the product for the last time:
# in days, and normalized by the user's mean days between orders.
dfTemp2 <- df
# First orders have NA days_since_prior_order; treat them as a 0-day gap
dfTemp2$days_since_prior_order <- replace_na(dfTemp2$days_since_prior_order,0)
# Sum of prior-order gaps over the orders in which the user bought the product
dfTemp <- dfTemp2 %>% group_by(user_id, product_id) %>% summarise(daysProduct = sum(days_since_prior_order))
# Total days spanned by the user's whole order history; days_since_prior_order
# is constant within an order, so sum/n() recovers the per-order value.
# NOTE(review): daysProduct sums gaps only for orders containing the product,
# so PU_DaysSinceLastBougth is an approximation of "days since last bought"
# for products purchased more than once — confirm this is intended.
dfTemp <- dfTemp2 %>% group_by(user_id, order_id) %>% summarise(n = sum(days_since_prior_order)/n()) %>%
  group_by(user_id) %>% summarise(daysProductMAX = sum(n)) %>%
  left_join(dfTemp, by='user_id') %>% mutate(PU_DaysSinceLastBougth = daysProductMAX-daysProduct)
rm(dfTemp2)
dfTemp$daysProductMAX <- NULL
dfTemp$daysProduct <- NULL

# Normalized version: days since last bought / user's mean days between orders
dfTemp <- left_join(dfTemp,dfUsers[,c('user_id','daysPriorOrderMeanUser')], by='user_id')
dfTemp <- dfTemp %>% mutate(PU_DaysSinceLastBougthNorm = PU_DaysSinceLastBougth/daysPriorOrderMeanUser)
dfTemp$daysPriorOrderMeanUser <- NULL
dfFinal <- left_join(dfFinal, dfTemp, by=c('user_id','product_id'))
# Boolean: was the product ever reordered by this user
dfTemp <- df %>% group_by(user_id, product_id) %>% summarise(PU_PrdEverReorderedUser = max(reordered))
dfFinal <- left_join(dfFinal, dfTemp, by=c('user_id','product_id'))

# Add market-basket-analysis rule flags for products
dfFinal <- left_join(dfFinal, dfAssociationRulesFinal, by='product_name')
# Products not covered by any association rule get NA from the join; use 0
dfFinal$Rule1 <- replace_na(dfFinal$Rule1,0)
dfFinal$Rule2 <- replace_na(dfFinal$Rule2,0)
rm(dfTemp)

# Flag (user, product) pairs that appear in the user's last order (target = 1)
dfUsersPrdTrain <- dfOrderProductsTrain %>%
  distinct(user_id, product_id) %>%
  mutate(boughtLastOrder = 1)
dfFinal <- left_join(dfFinal, dfUsersPrdTrain, by=c('user_id', 'product_id'))

# Pairs absent from the last order (and any remaining NAs) become 0
dfFinal <- dfFinal %>% replace(is.na(.), 0)
# Drop text columns that are not used for modeling
dfFinal$product_name <- NULL
dfFinal$aisle <- NULL
dfFinal$department <- NULL

# Normalize features with min-max scaling, skipping the identifier columns
# (1, 2) and the factor/target columns (42, 43, 44).
# FIX: guard against zero-range (constant) columns, which previously produced
# NaN via division by zero and would break model training.
for (item in names(dfFinal[,-c(1,2,42,43,44)])){
  X <- dfFinal[[item]]
  if (max(X) > min(X)) {
    dfFinal[[item]] <- (X - min(X)) / (max(X) - min(X))
  } else {
    # Constant column: map to 0 instead of NaN
    dfFinal[[item]] <- 0
  }
}

# Convert the target and the market-basket rule flags to factors
dfFinal <- dfFinal %>%
  mutate(
    boughtLastOrder = factor(boughtLastOrder),
    Rule1 = factor(Rule1),
    Rule2 = factor(Rule2)
  )

# Split by USER (not by row) so all of a user's pairs land on the same side
dfUsersTrain <- dfOrderProductsTrain %>% select(user_id) %>% distinct()
trainIndex <- createDataPartition(dfUsersTrain$user_id, p = .7, list = FALSE, times = 1)
# FIX: createDataPartition returns row indices; `dfUsersTrain[trainIndex]` is
# only valid data.table syntax — on a plain data.frame/tibble it selects
# COLUMNS. Index the user_id vector explicitly so it works for any class.
trainSet <- dfUsersTrain$user_id[trainIndex]
testSet  <- dfUsersTrain$user_id[-trainIndex]

trainSet <- dfFinal %>% filter(user_id %in% trainSet)
testSet <- dfFinal %>% filter(user_id %in% testSet)

# Drop identifier columns so the models never see them as predictors
trainSet$user_id <- NULL
trainSet$product_id <- NULL
testSet$user_id <- NULL
testSet$product_id <- NULL

# Free memory: the feature-engineering intermediates are no longer needed
rm(df, dfFinal, dfOrderProductsTrain, dfUsers, dfProducts, trainIndex, dfUsersPrdTrain, dfUsersTrain, item, X, dfAssociationRules, dfAssociationRulesFinal)

# Train models
# The engineered datasets were persisted once; reload them from disk here.
#write.csv(trainSet,"trainSet.csv", quote = FALSE, row.names = FALSE)
#write.csv(testSet,"testSet.csv", quote = FALSE, row.names = FALSE)
trainSet <- fread("trainSet.csv")
testSet <- fread("testSet.csv")
# fread reads everything as plain vectors; restore the factor columns
trainSet$boughtLastOrder <- factor(trainSet$boughtLastOrder)
trainSet$Rule1 <- factor(trainSet$Rule1)
trainSet$Rule2 <- factor(trainSet$Rule2)
testSet$boughtLastOrder <- factor(testSet$boughtLastOrder)
testSet$Rule1 <- factor(testSet$Rule1)
testSet$Rule2 <- factor(testSet$Rule2)

# As this is an educational project and for computational reasons, we reduce the
# train dataset so models run faster, taking the opportunity to balance it 50/50.
# NOTE(review): there is no set.seed() before these sample_n() calls, so the
# samples (and the printed results below) are not reproducible — consider seeding.
dfTemp <- trainSet %>% filter(boughtLastOrder==1) %>% sample_n(15000)
trainSet <- trainSet %>% filter(boughtLastOrder==0) %>% sample_n(15000) %>% bind_rows(dfTemp)
# Shuffle the stacked halves so the classes are interleaved
trainSet <- sample_n(trainSet, 30000, replace=FALSE)

# Balanced test set (5000 per class) for a fairer per-class evaluation
dfTemp <- testSet %>% filter(boughtLastOrder==1) %>% sample_n(5000)
testSetB <- testSet %>% filter(boughtLastOrder==0) %>% sample_n(5000) %>% bind_rows(dfTemp)
testSetB <- sample_n(testSetB, 10000, replace=FALSE)

# Unbalanced test sample keeps the natural class distribution
testSet <- sample_n(testSet, 10000, replace=FALSE)

rm(dfTemp)

# Define cross validation (5-fold) and the model formula (all predictors)
ctrl <- trainControl(method = "cv", number=5)
variaveisModelo <- as.formula(boughtLastOrder ~ .)

library(doParallel)
## Warning: package 'doParallel' was built under R version 3.6.3
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# Parallel backend with 5 workers to speed up caret's cross-validation.
# NOTE(review): no matching stopCluster(cl) is visible in this chunk — make
# sure the cluster is shut down at the end of the script to free the workers.
cl <- makePSOCKcluster(5)
registerDoParallel(cl)


# Random Forest (caret tunes mtry via the 5-fold CV defined in ctrl)
modeloRF <- train(variaveisModelo, data=trainSet, method='rf', trControl=ctrl)

# Checking the most important variables
plot(varImp(modeloRF))

# We can see that the product and user indicators are the most important, while
# the variables from the market basket analysis are weak predictors

print(modeloRF)
## Random Forest 
## 
## 30000 samples
##    40 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24000, 24000, 24000, 24000, 24000 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7233667  0.4467333
##   33    0.7427000  0.4854000
##   64    0.7418000  0.4836000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 33.
# Evaluate the Random Forest on the unbalanced test sample
previsoes <- data.frame(observado = testSet$boughtLastOrder,
                        previsto = predict(modeloRF, newdata = testSet))
# FIX: caret::confusionMatrix takes predictions as `data` (1st arg) and the
# ground truth as `reference` (2nd arg). The original call had them swapped,
# which transposes the table and swaps the reported precision/recall.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6702 2319
##          1  262  717
##                                           
##                Accuracy : 0.7419          
##                  95% CI : (0.7332, 0.7505)
##     No Information Rate : 0.6964          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2454          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##               Precision : 0.7429          
##                  Recall : 0.9624          
##                      F1 : 0.8385          
##              Prevalence : 0.6964          
##          Detection Rate : 0.6702          
##    Detection Prevalence : 0.9021          
##       Balanced Accuracy : 0.5993          
##                                           
##        'Positive' Class : 0               
## 
# We got good accuracy, but only because the test dataset is really unbalanced.
# The model made wrong predictions for class 1 (user ordered), making more
# mistakes than correct forecasts. Let's use the balanced test dataset and
# check accuracy.

previsoes <- data.frame(observado = testSetB$boughtLastOrder,
                        previsto = predict(modeloRF, newdata = testSetB))
# FIX: predictions go first (data), truth second (reference); the original
# call was swapped, transposing the matrix and swapping precision/recall.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3676 1324
##          1 1246 3754
##                                           
##                Accuracy : 0.743           
##                  95% CI : (0.7343, 0.7515)
##     No Information Rate : 0.5078          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.486           
##                                           
##  Mcnemar's Test P-Value : 0.1288          
##                                           
##               Precision : 0.7352          
##                  Recall : 0.7469          
##                      F1 : 0.7410          
##              Prevalence : 0.4922          
##          Detection Rate : 0.3676          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.7431          
##                                           
##        'Positive' Class : 0               
## 
# Now the overall accuracy has moved down, but the model got more correct
# predictions for class 1. Let's try another model.

# Stochastic Gradient Boosting (gbm; caret's default tuning grid)
modeloSGB <- train(variaveisModelo, data=trainSet, method='gbm', trControl=ctrl, verbose=FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 39: Rule12% Milkfat has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 44: Rule1Cookies has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 46: Rule1Hot has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 49: Rule1Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 54: Rule1Select-A-Size Paper Towels has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 55: Rule1Tomatoes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 56: Rule1Tortilla Chips has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 57: Rule1Tortillas has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 58: Rule1Vanilla has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 59: Rule1White has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 61: Rule2Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 62: Rule2Salsa has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 64: Rule2White has no variation.
# Show cross-validation results across the GBM tuning grid
print(modeloSGB)
## Stochastic Gradient Boosting 
## 
## 30000 samples
##    40 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24000, 24000, 24000, 24000, 24000 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7392667  0.4785333
##   1                  100      0.7432000  0.4864000
##   1                  150      0.7429667  0.4859333
##   2                   50      0.7425667  0.4851333
##   2                  100      0.7434000  0.4868000
##   2                  150      0.7437333  0.4874667
##   3                   50      0.7444667  0.4889333
##   3                  100      0.7446667  0.4893333
##   3                  150      0.7449000  0.4898000
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Evaluate the GBM on the unbalanced test sample
previsoes <- data.frame(observado = testSet$boughtLastOrder,
                        previsto = predict(modeloSGB, newdata = testSet))
# FIX: confusionMatrix(data, reference) expects predictions first; the original
# call was swapped, transposing the matrix and swapping precision/recall.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6813 2208
##          1  258  721
##                                           
##                Accuracy : 0.7534          
##                  95% CI : (0.7448, 0.7618)
##     No Information Rate : 0.7071          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2605          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##               Precision : 0.7552          
##                  Recall : 0.9635          
##                      F1 : 0.8468          
##              Prevalence : 0.7071          
##          Detection Rate : 0.6813          
##    Detection Prevalence : 0.9021          
##       Balanced Accuracy : 0.6048          
##                                           
##        'Positive' Class : 0               
## 
# The result has improved, but it keeps the same problem, caused by the
# predominance of class 0. Now with the balanced test dataset.

previsoes <- data.frame(observado = testSetB$boughtLastOrder,
                        previsto = predict(modeloSGB, newdata = testSetB))
# FIX: predictions first (data), truth second (reference); original was swapped.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3760 1240
##          1 1301 3699
##                                           
##                Accuracy : 0.7459          
##                  95% CI : (0.7372, 0.7544)
##     No Information Rate : 0.5061          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.4918          
##                                           
##  Mcnemar's Test P-Value : 0.2339          
##                                           
##               Precision : 0.7520          
##                  Recall : 0.7429          
##                      F1 : 0.7474          
##              Prevalence : 0.5061          
##          Detection Rate : 0.3760          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.7459          
##                                           
##        'Positive' Class : 0               
## 
# The result improved a lot. Let's try another model.

# Extreme Gradient Boosting (xgbLinear; caret's default tuning grid)
modeloEGB <- train(variaveisModelo, data=trainSet, method='xgbLinear', trControl=ctrl)
print(modeloEGB)
## eXtreme Gradient Boosting 
## 
## 30000 samples
##    40 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24000, 24000, 24000, 24000, 24000 
## Resampling results across tuning parameters:
## 
##   lambda  alpha  nrounds  Accuracy   Kappa    
##   0e+00   0e+00   50      0.7341000  0.4682000
##   0e+00   0e+00  100      0.7315667  0.4631333
##   0e+00   0e+00  150      0.7253667  0.4507333
##   0e+00   1e-04   50      0.7331667  0.4663333
##   0e+00   1e-04  100      0.7294333  0.4588667
##   0e+00   1e-04  150      0.7271333  0.4542667
##   0e+00   1e-01   50      0.7345000  0.4690000
##   0e+00   1e-01  100      0.7290000  0.4580000
##   0e+00   1e-01  150      0.7257667  0.4515333
##   1e-04   0e+00   50      0.7331667  0.4663333
##   1e-04   0e+00  100      0.7294333  0.4588667
##   1e-04   0e+00  150      0.7261667  0.4523333
##   1e-04   1e-04   50      0.7331667  0.4663333
##   1e-04   1e-04  100      0.7292000  0.4584000
##   1e-04   1e-04  150      0.7263000  0.4526000
##   1e-04   1e-01   50      0.7344667  0.4689333
##   1e-04   1e-01  100      0.7287667  0.4575333
##   1e-04   1e-01  150      0.7262333  0.4524667
##   1e-01   0e+00   50      0.7362000  0.4724000
##   1e-01   0e+00  100      0.7306000  0.4612000
##   1e-01   0e+00  150      0.7270667  0.4541333
##   1e-01   1e-04   50      0.7370667  0.4741333
##   1e-01   1e-04  100      0.7309667  0.4619333
##   1e-01   1e-04  150      0.7273333  0.4546667
##   1e-01   1e-01   50      0.7394667  0.4789333
##   1e-01   1e-01  100      0.7315667  0.4631333
##   1e-01   1e-01  150      0.7270333  0.4540667
## 
## Tuning parameter 'eta' was held constant at a value of 0.3
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, lambda = 0.1, alpha = 0.1 and eta = 0.3.
# Evaluate the XGBoost model on the unbalanced test sample
previsoes <- data.frame(observado = testSet$boughtLastOrder,
                        previsto = predict(modeloEGB, newdata = testSet))
# FIX: confusionMatrix(data, reference) expects predictions first; the original
# call was swapped, transposing the matrix and swapping precision/recall.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6649 2372
##          1  262  717
##                                           
##                Accuracy : 0.7366          
##                  95% CI : (0.7278, 0.7452)
##     No Information Rate : 0.6911          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2394          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##               Precision : 0.7371          
##                  Recall : 0.9621          
##                      F1 : 0.8347          
##              Prevalence : 0.6911          
##          Detection Rate : 0.6649          
##    Detection Prevalence : 0.9021          
##       Balanced Accuracy : 0.5971          
##                                           
##        'Positive' Class : 0               
## 
# The result got worse.
# Now with the balanced test dataset.

previsoes <- data.frame(observado = testSetB$boughtLastOrder,
                        previsto = predict(modeloEGB, newdata = testSetB))
# FIX: predictions first (data), truth second (reference); original was swapped.
# (The printed output below still reflects the original, swapped call.)
confusionMatrix(previsoes$previsto, previsoes$observado, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3690 1310
##          1 1260 3740
##                                           
##                Accuracy : 0.743           
##                  95% CI : (0.7343, 0.7515)
##     No Information Rate : 0.505           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.486           
##                                           
##  Mcnemar's Test P-Value : 0.3338          
##                                           
##               Precision : 0.7380          
##                  Recall : 0.7455          
##                      F1 : 0.7417          
##              Prevalence : 0.4950          
##          Detection Rate : 0.3690          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.7430          
##                                           
##        'Positive' Class : 0               
## 
# Almost the same result as SGB.

# Optimize the best model - Stochastic Gradient Boosting
ctrl <- trainControl(method = "cv", number=5)
# Wider tuning grid for gbm: tree depth, number of trees, learning rate
# (shrinkage) and minimum observations per terminal node
grid <- expand.grid(interaction.depth = c(1, 10, 15),
                    n.trees = c(50, 300, 500),
                    shrinkage = c(.1, .3, .5),
                    n.minobsinnode = c(1, 5, 7))
modeloSGB <- train(variaveisModelo, data=trainSet, method='gbm', trControl=ctrl, tuneGrid=grid, verbose=FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 39: Rule12% Milkfat has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 44: Rule1Cookies has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 46: Rule1Hot has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 49: Rule1Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 54: Rule1Select-A-Size Paper Towels has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 55: Rule1Tomatoes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 56: Rule1Tortilla Chips has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 57: Rule1Tortillas has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 58: Rule1Vanilla has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 59: Rule1White has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 61: Rule2Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 62: Rule2Salsa has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 64: Rule2White has no variation.
# Show cross-validation results across the full custom tuning grid
print(modeloSGB)
## Stochastic Gradient Boosting 
## 
## 30000 samples
##    40 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24000, 24000, 24000, 24000, 24000 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  n.trees  Accuracy   Kappa    
##   0.1         1                 1                50      0.7391000  0.4782000
##   0.1         1                 1               300      0.7440000  0.4880000
##   0.1         1                 1               500      0.7446000  0.4892000
##   0.1         1                 5                50      0.7386333  0.4772667
##   0.1         1                 5               300      0.7445333  0.4890667
##   0.1         1                 5               500      0.7440000  0.4880000
##   0.1         1                 7                50      0.7395000  0.4790000
##   0.1         1                 7               300      0.7444000  0.4888000
##   0.1         1                 7               500      0.7444000  0.4888000
##   0.1        10                 1                50      0.7472000  0.4944000
##   0.1        10                 1               300      0.7434000  0.4868000
##   0.1        10                 1               500      0.7410000  0.4820000
##   0.1        10                 5                50      0.7450333  0.4900667
##   0.1        10                 5               300      0.7435333  0.4870667
##   0.1        10                 5               500      0.7410667  0.4821333
##   0.1        10                 7                50      0.7466333  0.4932667
##   0.1        10                 7               300      0.7451333  0.4902667
##   0.1        10                 7               500      0.7404333  0.4808667
##   0.1        15                 1                50      0.7469667  0.4939333
##   0.1        15                 1               300      0.7427667  0.4855333
##   0.1        15                 1               500      0.7385000  0.4770000
##   0.1        15                 5                50      0.7455000  0.4910000
##   0.1        15                 5               300      0.7418667  0.4837333
##   0.1        15                 5               500      0.7376000  0.4752000
##   0.1        15                 7                50      0.7464667  0.4929333
##   0.1        15                 7               300      0.7420667  0.4841333
##   0.1        15                 7               500      0.7364667  0.4729333
##   0.3         1                 1                50      0.7427333  0.4854667
##   0.3         1                 1               300      0.7425000  0.4850000
##   0.3         1                 1               500      0.7413667  0.4827333
##   0.3         1                 5                50      0.7417333  0.4834667
##   0.3         1                 5               300      0.7436667  0.4873333
##   0.3         1                 5               500      0.7405333  0.4810667
##   0.3         1                 7                50      0.7424333  0.4848667
##   0.3         1                 7               300      0.7422000  0.4844000
##   0.3         1                 7               500      0.7417333  0.4834667
##   0.3        10                 1                50      0.7409333  0.4818667
##   0.3        10                 1               300      0.7194667  0.4389333
##   0.3        10                 1               500      0.7162333  0.4324667
##   0.3        10                 5                50      0.7423000  0.4846000
##   0.3        10                 5               300      0.7218667  0.4437333
##   0.3        10                 5               500      0.7173333  0.4346667
##   0.3        10                 7                50      0.7383667  0.4767333
##   0.3        10                 7               300      0.7184000  0.4368000
##   0.3        10                 7               500      0.7111333  0.4222667
##   0.3        15                 1                50      0.7371333  0.4742667
##   0.3        15                 1               300      0.7123333  0.4246667
##   0.3        15                 1               500      0.7080333  0.4160667
##   0.3        15                 5                50      0.7365333  0.4730667
##   0.3        15                 5               300      0.7155000  0.4310000
##   0.3        15                 5               500      0.7112333  0.4224667
##   0.3        15                 7                50      0.7353000  0.4706000
##   0.3        15                 7               300      0.7133333  0.4266667
##   0.3        15                 7               500      0.7093667  0.4187333
##   0.5         1                 1                50      0.7405333  0.4810667
##   0.5         1                 1               300      0.7396667  0.4793333
##   0.5         1                 1               500      0.7385667  0.4771333
##   0.5         1                 5                50      0.7421000  0.4842000
##   0.5         1                 5               300      0.7400667  0.4801333
##   0.5         1                 5               500      0.7395333  0.4790667
##   0.5         1                 7                50      0.7429000  0.4858000
##   0.5         1                 7               300      0.7408667  0.4817333
##   0.5         1                 7               500      0.7394667  0.4789333
##   0.5        10                 1                50      0.7269000  0.4538000
##   0.5        10                 1               300      0.6963333  0.3926667
##   0.5        10                 1               500      0.6904000  0.3808000
##   0.5        10                 5                50      0.7294000  0.4588000
##   0.5        10                 5               300      0.6955667  0.3911333
##   0.5        10                 5               500      0.6891333  0.3782667
##   0.5        10                 7                50      0.7294333  0.4588667
##   0.5        10                 7               300      0.6964000  0.3928000
##   0.5        10                 7               500      0.6925667  0.3851333
##   0.5        15                 1                50      0.7205000  0.4410000
##   0.5        15                 1               300      0.6911667  0.3823333
##   0.5        15                 1               500      0.6860333  0.3720667
##   0.5        15                 5                50      0.7185000  0.4370000
##   0.5        15                 5               300      0.6887333  0.3774667
##   0.5        15                 5               500      0.6891000  0.3782000
##   0.5        15                 7                50      0.7227000  0.4454000
##   0.5        15                 7               300      0.6943667  0.3887333
##   0.5        15                 7               500      0.6921000  0.3842000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth = 10, shrinkage = 0.1 and n.minobsinnode = 1.
# Best parameters found by the grid search
modeloSGB$bestTune
##    n.trees interaction.depth shrinkage n.minobsinnode
## 10      50                10       0.1              1
# Re-train the final model using only the best parameter combination
modeloSGB <- train(variaveisModelo, data=trainSet, method='gbm', trControl=ctrl, tuneGrid=modeloSGB$bestTune, verbose=FALSE)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 39: Rule12% Milkfat has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 44: Rule1Cookies has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 46: Rule1Hot has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 49: Rule1Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 54: Rule1Select-A-Size Paper Towels has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 55: Rule1Tomatoes has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 56: Rule1Tortilla Chips has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 57: Rule1Tortillas has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 58: Rule1Vanilla has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 59: Rule1White has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 61: Rule2Milk has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 62: Rule2Salsa has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution = "bernoulli", : variable 64: Rule2White has no variation.
# Show cross-validation results for the final (fixed-parameter) model
print(modeloSGB)
## Stochastic Gradient Boosting 
## 
## 30000 samples
##    40 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24000, 24000, 24000, 24000, 24000 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7471667  0.4943333
## 
## Tuning parameter 'n.trees' was held constant at a value of 50
## Tuning parameter 'interaction.depth' was held constant at a value of 10
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## Tuning parameter 'n.minobsinnode' was held constant at a value of 1
# Evaluate the boosted model on the original (imbalanced) hold-out set.
# NOTE(review): caret::confusionMatrix(data, reference) expects predictions
# as the first argument; here the observed labels are passed first, so the
# reported Precision/Recall are computed with roles swapped — confirm intent.
pred_imbalanced <- predict(modeloSGB, newdata = testSet)
previsoes <- data.frame(
  observado = testSet$boughtLastOrder,
  previsto  = pred_imbalanced
)
confusionMatrix(previsoes$observado, previsoes$previsto, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6721 2300
##          1  234  745
##                                          
##                Accuracy : 0.7466         
##                  95% CI : (0.738, 0.7551)
##     No Information Rate : 0.6955         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2607         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##               Precision : 0.7450         
##                  Recall : 0.9664         
##                      F1 : 0.8414         
##              Prevalence : 0.6955         
##          Detection Rate : 0.6721         
##    Detection Prevalence : 0.9021         
##       Balanced Accuracy : 0.6055         
##                                          
##        'Positive' Class : 0              
## 
# Evaluate the same model on the balanced hold-out set, so the metrics are
# not dominated by the majority (not-bought) class.
pred_balanced <- predict(modeloSGB, newdata = testSetB)
previsoes <- data.frame(
  observado = testSetB$boughtLastOrder,
  previsto  = pred_balanced
)
confusionMatrix(previsoes$observado, previsoes$previsto, mode = "prec_recall")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3680 1320
##          1 1216 3784
##                                           
##                Accuracy : 0.7464          
##                  95% CI : (0.7378, 0.7549)
##     No Information Rate : 0.5104          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.4928          
##                                           
##  Mcnemar's Test P-Value : 0.04082         
##                                           
##               Precision : 0.7360          
##                  Recall : 0.7516          
##                      F1 : 0.7437          
##              Prevalence : 0.4896          
##          Detection Rate : 0.3680          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.7465          
##                                           
##        'Positive' Class : 0               
## 
# The final result is slightly better than what we obtained before.
# To improve performance further, additional User/Product indicators could be
# engineered. We could also train the model on the full dataset, without the
# cuts made here for computational reasons. Collaborative filtering
# techniques could be applied as well.

# Release the parallel backend workers started earlier in the script
# (cl is the cluster object created before model training).
stopCluster(cl)